library(DT)
library(dplyr)
library(readxl)
library(plotly)
library(skimr)
library(ggplot2)
library(gridExtra)
file_path = "../data/master_df.xlsx"
df = read_excel(file_path)
datatable(head(df, 10),
options = list(scrollX = TRUE,
pageLength = 5,
autoWidth = TRUE,
searching = FALSE),
caption = "Preview of Trade and Congressional Data")
skim_df = skim(df) %>%
rename(variable_type = skim_type,
variable_name = skim_variable,
missing_values = n_missing,
min_value = POSIXct.min,
max_value = POSIXct.max,
median_value = POSIXct.median,
unique_values = POSIXct.n_unique,
)
datatable(skim_df, options = list(scrollX = TRUE,
pageLength = 10,
autoWidth = TRUE,
searching = FALSE),
caption = "Summary Statistics for Data")
numerical_vars = c("trade_price", "price_change_pct", "seniority_years", "excess_return")
create_histogram = function(var) {ggplot(df, aes(x = .data[[var]])) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", color = "black", alpha = 0.7) +
geom_density(color = "red", size = 1) +
labs(title = paste("Distribution of", var), x = var, y = "Density") +
theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 14),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12))}
histogram_plots = lapply(numerical_vars, create_histogram)
grid.arrange(grobs = histogram_plots, ncol = 2)
### Visualizations for categorical values
create_interactive_barplot = function(var) {
plot_ly(data = df, y = ~reorder(.data[[var]], table(.data[[var]])[.data[[var]]]),
type = "bar", orientation = "h", marker = list(color = "steelblue")) %>%
layout(title = paste("Distribution of", var), xaxis = list(title = "Count"),
yaxis = list(title = var))}
transaction_plot <- create_interactive_barplot("transaction_type")
party_plot <- create_interactive_barplot("party")
state_plot <- create_interactive_barplot("state")
gender_plot <- create_interactive_barplot("gender")
transaction_plot
party_plot
state_plot
gender_plot
define_trade_success = function(row) {
if (row$transaction_type %in% c("Purchase")) {
return(ifelse(row$price_change_pct > 0, 1, 0))}
else if (row$transaction_type %in% c("Sale", "Sale (Partial)", "Sale (Full)")) {
return(ifelse(row$price_change_pct < 0, 1, 0))}
else {return(NA)}}
df = df %>%
rowwise() %>%
mutate(trade_success = define_trade_success(cur_data())) %>%
ungroup() %>%
filter(!is.na(trade_success))
define_market_success = function(row) {
if (row$transaction_type %in% c("Purchase")) {
return(ifelse(row$excess_return > 0, 1, 0))}
else if (row$transaction_type %in% c("Sale", "Sale (Partial)", "Sale (Full)")) {
return(ifelse(row$excess_return < 0 & row$price_change_pct < 0, 1, 0))}
else {return(NA)}}
df = df %>%
rowwise() %>%
mutate(market_success = define_market_success(cur_data())) %>%
ungroup() %>%
filter(!is.na(market_success))
success_colors = c("0" = "red", "1" = "green")
success_plot = ggplot(df, aes(x = factor(trade_success),
fill = factor(trade_success))) + geom_bar() +
scale_fill_manual(values = success_colors, labels = c("0: Unsuccessful", "1: Successful")) +
labs(title = "Distribution of Trade Success Outcomes",
x = "Trade Success", y = "Count", fill = "Outcome") +
theme(plot.title = element_text(hjust = 0.5, size = 14),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12),
legend.position = "top")
market_success_plot = ggplot(df, aes(x = factor(market_success),
fill = factor(market_success))) + geom_bar() +
scale_fill_manual(values = success_colors, labels = c("0: Unsuccessful", "1: Successful")) +
labs(title = "Distribution of Market Success Outcomes",
x = "Market Success", y = "Count", fill = "Outcome") +
theme(plot.title = element_text(hjust = 0.5, size = 14),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12),
legend.position = "top" )
grid.arrange(success_plot, market_success_plot, ncol = 2)
z = qnorm(0.975)
compute_confidence_interval = function(successes, total) {
p = successes / total
se = sqrt(p * (1 - p) / total)
lower = max(0, p - z * se)
upper = min(1, p + z * se)
return(list(proportion = p, lower_bound = lower, upper_bound = upper))}
committee_columns = grep("committee_", colnames(df), value = TRUE)
committee_success_rates = data.frame(committee = character(),
success_rate = numeric(),
lower_ci = numeric(),
upper_ci = numeric(),
total_trades = integer(),
stringsAsFactors = FALSE)
for (committee in committee_columns) {committee_name = gsub("committee_", "", committee)
committee_data = df %>% filter(.data[[committee]] == 1)
total_trades = nrow(committee_data)
successes = nrow(committee_data %>% filter(trade_success == 1))
if (total_trades > 0) {
ci = compute_confidence_interval(successes, total_trades)
committee_success_rates = rbind(committee_success_rates,
data.frame(committee = committee_name,
success_rate = ci$proportion,
lower_ci = ci$lower_bound,
upper_ci = ci$upper_bound,
total_trades = total_trades))}
else {committee_success_rates = rbind(committee_success_rates,
data.frame(committee = committee_name,
success_rate = NA,
lower_ci = NA,
upper_ci = NA,
total_trades = total_trades))}}
committee_success_rates = committee_success_rates %>% filter(!is.na(success_rate))
committee_success_rates = committee_success_rates %>% arrange(desc(success_rate))
ggplot(committee_success_rates, aes(x = success_rate, y = reorder(committee, success_rate))) +
geom_bar(stat = "identity", fill = "skyblue", width = 0.8) +
geom_errorbar(aes(xmin = lower_ci, xmax = upper_ci),
width = 0.2, color = "black") +
labs(title = "Success Rate by Committee with Confidence Intervals",
x = "Success Rate", y = "Committee") + theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 16, margin = margin(b = 10)),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12))
max_trades = max(committee_success_rates$total_trades, na.rm = TRUE)
committee_success_rates = committee_success_rates %>%
mutate(bubble_size = total_trades / max_trades * 1000)
ggplot(committee_success_rates, aes(x = success_rate, y = reorder(committee, success_rate), size = bubble_size)) +
geom_point(alpha = 0.6, color = "steelblue") +
scale_size_continuous(range = c(3, 20),
name = "Number of Trades",
breaks = c(100, 500, 1000),
labels = c("100 trades", "500 trades", "1000 trades")) +
labs(title = "Success Rate by Committee (Bubble Size = Number of Trades)",
x = "Success Rate", y = "Committee") +
theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 16),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12),
legend.position = "right" )